#---------------------------------------------------------------------------
# Author          : Manasa Ramakrishna, mr325@le.ac.uk
# Date started  : 1st June, 2017
# Last modified : 7th June, 2017
# Aim             : To take a look at first SILAC labelled LOPIT data on Trizol
# Depends       : On 'silacFunctions.R'. Make sure they are in the same directory
# Notes         : Works on data from Rayner's first experiments
#--------------------------------------------------------------------------- 


# Invoking libraries
# source("https://bioconductor.org/biocLite.R")
# biocLite("Mus.musculus")
library(clusterProfiler)
## Loading required package: DOSE
## 
## DOSE v3.0.10  For help: https://guangchuangyu.github.io/DOSE
## 
## If you use DOSE in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609
## clusterProfiler v3.2.14  For help: https://guangchuangyu.github.io/clusterProfiler
## 
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu., Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
library(ggplot2)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(limma)
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:limma':
## 
##     plotMA
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:gplots':
## 
##     space
## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums
## 
library(outliers)
library(RColorBrewer)
library(reshape2)
library(stringr)
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
library(wesanderson)

#Setting working directories
wd = "/Users/manasa/Documents/Work/TTT/02_Proteomics/01_First-SILAC-LOPIT/"
setwd(wd)
getwd()
## [1] "/Users/manasa/Documents/Work/TTT/02_Proteomics/01_First-SILAC-LOPIT"
indir = paste(wd,"Input",sep="/")
outdir = paste(wd,paste(Sys.Date(),"Output",sep = "_"),sep = "/")

if (exists(outdir)){
  print("Outdir exists")
}else{
  dir.create(outdir)
}
## Warning in dir.create(outdir): '/Users/manasa/Documents/Work/TTT/
## 02_Proteomics/01_First-SILAC-LOPIT//2017-06-14_Output' already exists
# Sourcing function file
source("silacFunctions.R")

# -----------
# Read data
# -----------

# File of contaminants - proteins to exclude from analysis as are things like keratin, alcohol dehydrogenase etc....
contam = read.delim("Input/Common contaminant_all.csv",sep=",",header=T)

# Read in the data files that contain peptide level output from Proteome discoverer...
# Modify the headers to be all lower case as well as remove unwanted spaces, symbols etc...to keep it simple
# Columns of interest are "sequence", "modifications","master.protein.accessions","abundance.heavy","abundance.light","abundance.ratio","quan.info"

infiles = grep("Trizol",list.files("Input/",full.names = T),value=T)
prot.data = NULL
for (i in infiles){
  in.dat = read.delim(i,sep="\t",comment.char="",as.is=T,header=F)
  in.dat$sample = strsplit(i,"//")[[1]][2]
  print(i)
  prot.data = rbind(prot.data,in.dat)
}
## [1] "Input//Trizol_150mJ_rep1.txt"
## [1] "Input//Trizol_150mJ_rep2.txt"
## [1] "Input//Trizol_150mJ_rep3.txt"
## [1] "Input//Trizol_275mJ_rep1.txt"
## [1] "Input//Trizol_275mJ_rep2.txt"
## [1] "Input//Trizol_275mJ_rep3.txt"
## [1] "Input//Trizol_400mJ_rep1.txt"
## [1] "Input//Trizol_400mJ_rep2.txt"
## [1] "Input//Trizol_400mJ_rep3.txt"
colnames(prot.data) = prot.data[1,]
dim(prot.data)
## [1] 81595    29
# Remove header lines as they differ in one of the columns (fraction number I think)
remove.head = which(prot.data[,1]=="Checked")
prot.data = prot.data[-(remove.head),]
dim(prot.data)
## [1] 81586    29
# Change header names a little to make them neutral and remove space, special characters
colnames(prot.data) = tolower(colnames(prot.data))
colnames(prot.data) = gsub(" ",".",colnames(prot.data))
colnames(prot.data) = gsub("#","no",colnames(prot.data))
colnames(prot.data) = gsub("\\:\\.f2\\:","n",colnames(prot.data))
colnames(prot.data)[12] = "theoretical.mass"
colnames(prot.data)[13] = "light.sample"
colnames(prot.data)[14] = "heavy.sample"
colnames(prot.data)[15] = "abundance.ratio.heavy.to.light"
colnames(prot.data)[16] = "abundance.light"
colnames(prot.data)[17] = "abundance.heavy"
colnames(prot.data)[29] = "sample"

# Convert abundance values to numeric from character
prot.data$abundance.heavy = as.numeric(prot.data$abundance.heavy)
prot.data$abundance.light = as.numeric(prot.data$abundance.light)
prot.data$abundance.ratio.heavy.to.light = as.numeric(prot.data$abundance.ratio.heavy.to.light)

# Add rep, reagent and UV amount columns
prot.data$uv = sapply(strsplit(prot.data$sample,"_"),"[[",2)
prot.data$repl = gsub(".txt","",gsub("rep","",sapply(strsplit(prot.data$sample,"_"),"[[",3)))
prot.data$repl = paste(prot.data$uv,prot.data$rep,sep=".")
prot.data$reagent = sapply(strsplit(prot.data$sample,"_"),"[[",1)
head(prot.data)
##   checked confidence            sequence              modifications
## 2   FALSE       High          AGAHLQGGAK                           
## 3   FALSE       High        IMNTFSVVPSPK 1xLabel:13C(6)15N(2) [K12]
## 4   FALSE       High        IMNTFSVVPSPK                           
## 5   FALSE       High    NQVTQLKEQVPGFTPR                           
## 6   FALSE       High EQELQQTLQQEQSVLDQLR                           
## 7   FALSE       High      TTPSVVAFTADGER                           
##   qvality.pep qvality.q-value no.protein.groups no.proteins no.psms
## 2 5.61944E-06               0                 1           2      16
## 3  3.4751E-06               0                 4          11       8
## 4 2.42801E-06               0                 4          11       6
## 5 0.000142696               0                 1           2       4
## 6 1.97349E-07               0                 1           1       6
## 7 9.80731E-06               0                 1           3       4
##        master.protein.accessions no.missed.cleavages theoretical.mass
## 2                         P04406                   0      909.4900898
## 3 Q13509; P04350; P07437; P68371                   0      1327.716983
## 4 Q13509; P04350; P07437; P68371                   0      1319.702784
## 5                         F5H2F4                   1      1841.986822
## 6                         Q15149                   0      2313.168093
## 7                         P38646                   0      1450.717248
##   light.sample heavy.sample abundance.ratio.heavy.to.light abundance.light
## 2         High   Peak Found                           0.01         5200000
## 3    Not Found    Not Found                             NA              NA
## 4    Not Found    Not Found                             NA              NA
## 5    Not Found    Not Found                             NA              NA
## 6         High    Not Found                           0.01          656400
## 7         High    Not Found                             NA              NA
##   abundance.heavy      quan.info amanda.score.ms.amanda
## 2           43050         Unique             135.209857
## 3              NA No Quan Values            201.8662137
## 4              NA No Quan Values            176.2597659
## 5              NA No Quan Values            127.0712654
## 6              NA         Unique            152.1398947
## 7              NA No Quan Values            104.2831076
##   confidence.ms.amanda search.space.ms.amanda percolator.q-value.ms.amanda
## 2                 High                   1636                            0
## 3                 High                   2601                            0
## 4                 High                   2728                            0
## 5                 High                   3048                            0
## 6                 High                   3758                            0
## 7                 High                   2414                            0
##   percolator.pep.ms.amanda ions.score.mascot confidence.mascot
## 2               0.00001018             65.04              High
## 3              0.000002614             75.28              High
## 4               0.00001715             79.87              High
## 5               0.00005757             40.09              High
## 6                1.928E-07             44.95              High
## 7               0.00002452             48.27              High
##   search.space.mascot percolator.q-value.mascot percolator.pep.mascot
## 2                                             0             2.945E-07
## 3                                             0             1.695E-07
## 4                                             0             1.112E-07
## 5                                             0            0.00003356
## 6                                             0             1.211E-08
## 7                                             0             5.664E-07
##                  sample    uv    repl reagent
## 2 Trizol_150mJ_rep1.txt 150mJ 150mJ.1  Trizol
## 3 Trizol_150mJ_rep1.txt 150mJ 150mJ.1  Trizol
## 4 Trizol_150mJ_rep1.txt 150mJ 150mJ.1  Trizol
## 5 Trizol_150mJ_rep1.txt 150mJ 150mJ.1  Trizol
## 6 Trizol_150mJ_rep1.txt 150mJ 150mJ.1  Trizol
## 7 Trizol_150mJ_rep1.txt 150mJ 150mJ.1  Trizol
dim(prot.data)
## [1] 81586    32

prot.data has 32 columns and 81,586 rows - each row belonging to a peptide. We now go through a series of filtering steps to obtain a d

# ------------------
# Step 1 : Filter 
# ------------------

# Step 1a : Filter only for those peptides that have a unique master protein. Done using column "quan.info" and titled 'Unique'
dim(prot.data)
## [1] 81586    32
peptide.stats = table(prot.data$sample,prot.data$quan.info)
peptide.stats
##                        
##                         No Quan Values Not Unique Unique
##   Trizol_150mJ_rep1.txt           5211        287   4088
##   Trizol_150mJ_rep2.txt           3869        248   3048
##   Trizol_150mJ_rep3.txt           4571        247   3483
##   Trizol_275mJ_rep1.txt           5266        289   3845
##   Trizol_275mJ_rep2.txt           4303        212   3091
##   Trizol_275mJ_rep3.txt           6578        468   6383
##   Trizol_400mJ_rep1.txt           5365        286   4129
##   Trizol_400mJ_rep2.txt           5203        265   3422
##   Trizol_400mJ_rep3.txt           4320        219   2890
filt.1a = prot.data[which(prot.data$quan.info == "Unique"),]
length(which(filt.1a$quan.info == "Unique"))
## [1] 34379
dim(filt.1a) #34279 are unique proteins, 47207 are non-unique or are missing values 
## [1] 34379    32
# This table is very odd. Rayner had an explanation but I'm still muddled. He said "High" was euivalent to "Peak found"
# However, there are peptides where it is "High" but the peptide values are NA. Hmmm....
table(light=filt.1a$light.sample,heavy=filt.1a$heavy.sample)
##             heavy
## light         High Not Found Peak Found
##   High           0     15810      11775
##   Not Found   1672         0          0
##   Peak Found  5122         0          0
# Step 1b : Filter out those proteins that are contaminants from the contaminants list and annotate missing values
filt.1b = filt.1a[-which(filt.1a$master.protein.accessions %in% contam$Protein.Group.Accessions),]
num.contams = length(which(filt.1a$master.protein.accessions %in% contam$Protein.Group.Accessions))

# Annotate which peptides are missing heavy, light or both, abundance values
filt.1b$missing.val = rowSums(is.na(filt.1b[,c("abundance.heavy", "abundance.light")])) > 0

dim(filt.1a) # 34379 in total
## [1] 34379    32
dim(filt.1b) # 33657 filtered proteins
## [1] 33657    33
print(num.contams) # 722 contaminant proteins
## [1] 722
# Want to do some stats with missing values. 
table(filt.1b$sample,filt.1b$missing.val) # More missing values in 150mJ_rep, 275mK_rep2 and 450mJ_rep3. However ration of missing/non-missing is same
##                        
##                         FALSE TRUE
##   Trizol_150mJ_rep1.txt  1914 2097
##   Trizol_150mJ_rep2.txt  1460 1523
##   Trizol_150mJ_rep3.txt  1667 1723
##   Trizol_275mJ_rep1.txt  1836 1915
##   Trizol_275mJ_rep2.txt  1358 1670
##   Trizol_275mJ_rep3.txt  3055 3175
##   Trizol_400mJ_rep1.txt  2024 2031
##   Trizol_400mJ_rep2.txt  1572 1797
##   Trizol_400mJ_rep3.txt  1428 1412
round(table(filt.1b$sample,filt.1b$missing.val)/rowSums(table(filt.1b$sample,filt.1b$missing.val))*100,2)
##                        
##                         FALSE  TRUE
##   Trizol_150mJ_rep1.txt 47.72 52.28
##   Trizol_150mJ_rep2.txt 48.94 51.06
##   Trizol_150mJ_rep3.txt 49.17 50.83
##   Trizol_275mJ_rep1.txt 48.95 51.05
##   Trizol_275mJ_rep2.txt 44.85 55.15
##   Trizol_275mJ_rep3.txt 49.04 50.96
##   Trizol_400mJ_rep1.txt 49.91 50.09
##   Trizol_400mJ_rep2.txt 46.66 53.34
##   Trizol_400mJ_rep3.txt 50.28 49.72
# How many missing in heavy, how many missing in light
miss.l = table("_Missing light values_"=filt.1b$missing.val,filt.1b$light.sample)
miss.h = table("_Missing heavy values_"=filt.1b$missing.val,filt.1b$heavy.sample)
miss = cbind(miss.l,miss.h)
colnames(miss) = c("Light_High","Light_NotFound","Light_Found","Heavy_High","Heavy_NotFound","Heavy_Found")
rownames(miss) = c("notMissing","missing")
print(miss)
##            Light_High Light_NotFound Light_Found Heavy_High Heavy_NotFound
## notMissing      11258              0        5056       5056              0
## missing         15644           1658          41       1699          15215
##            Heavy_Found
## notMissing       11258
## missing            429
# Plot density plots 
melt.1b = melt(filt.1b,id.vars = "repl", measure.vars = c("abundance.light", "abundance.heavy"))


ggplot(melt.1b,aes(x = log2(value+0.1))) + geom_density(aes(col = as.factor(variable)))+facet_wrap(~repl,ncol=3)+scale_colour_brewer(palette = "Dark2")
## Warning: Removed 17343 rows containing non-finite values (stat_density).

ggplot(filt.1b,aes(x = log2(abundance.ratio.heavy.to.light+0.1))) + geom_density(aes(col = as.factor(repl)))+facet_wrap(~repl,ncol=3)

ggplot(melt.1b,aes(x=repl,y = log2(value+0.1))) + geom_violin(aes(col = as.factor(variable)),draw_quantiles = c(0.25, 0.5, 0.75))
## Warning: Removed 17343 rows containing non-finite values (stat_ydensity).

We have a column called “missing.val” to identify which peptides have either a heavy or light abundance value missing. TRUE means it is missing one or both. FALSE means both values are present. A lot more “missing” values in the “heavy/non-crosslinked samples than”light/cross-linked samples“.

The above plots - density plots and violin plots include both missing and non-missing values. Hence, in the heavy/light density plots, you see a huge overlap in the curve with a tiny portion of the “light” curve going beyond the heavy curve. This is where we expect the interesting RNA binding proteins to lie. The next step however is to filter out the missing values.

# Step 1c : Filter out those peptides which are missing either "high" or "low" abundance values. We will look at this separately
# as we do not know for sure whether these are a result of extremely low signal due to enrichment or extremely low signal due to technical effects.
# There are 16314 peptides where we have quantification in both light and heavy abundance columns
filt.1c = filt.1b[which(rowSums(is.na(filt.1b[,c("abundance.heavy", "abundance.light")])) == 0),]
dim(filt.1c)
## [1] 16314    33
# Plot density plots 
melt.1c = melt(filt.1c,id.vars = "repl", measure.vars = c("abundance.light", "abundance.heavy"))
  
ggplot(melt.1c,aes(x = log2(value+0.1))) + geom_density(aes(col = as.factor(variable)))+facet_wrap(~repl,ncol=3)+scale_colour_brewer(palette = "Dark2")

ggplot(filt.1c,aes(x = log2(abundance.ratio.heavy.to.light+0.1))) + geom_density(aes(col = as.factor(repl)))+facet_wrap(~repl,ncol=3)

ggplot(melt.1c,aes(x=repl,y = log2(value+0.1))) + geom_violin(aes(col = as.factor(variable)),draw_quantiles = c(0.5))

Once we remove peptides where the “heavy” or “light” value is missing, then there is a clear shift in the curve of intensity values for the “light” labelled sample which is our cross-linked sample and we hope it contains true RNA binding proteins. The median abundance for light samples is visibily higher than in heavy samples.

Note : It is important to remember that in a true experimental setting, we will not have SILAC labelling so we won’t have “heavy” and “light” values per peptide - all we will have is one abundance value. Rayner forced the mass spec to run as if it didn’t know about the SILAC labelling to parially emulate later experiments. However, we won’t be using the singleton data (heacvy only or light only) for the purposes of this initial analysis.

# ------------------
# Step 2 : Log-transform 
# heavy = non-crosslinked
# light = crosslinked
# ------------------

# Log convert abundance values
filt.1c$heavy.log = log(filt.1c$abundance.heavy,2)
filt.1c$light.log = log(filt.1c$abundance.light,2)

# Generate an abundance ratio which for log transformed data is a subtraction
filt.1c$norm.abundance.ratio = filt.1c$light.log - filt.1c$heavy.log

# Data is checked
norm.data = filt.1c
dim(norm.data)
## [1] 16314    36
# Checking the counts of peaks with heavy and light values
table(light=norm.data$light.sample,heavy=norm.data$heavy.sample)
##             heavy
## light         High Peak Found
##   High           0      11258
##   Peak Found  5056          0

Once we have filtered the data to remove non-unique peptides and contaminants, we log transform (“normalise”) the data for heavy and light abundances. In addition, we subtract the logged abundances light-heavy to yield logged abundance ratios.

When we re-draw the table of heavy and light sample counts, we don’t have any “Not Found” values anymore. This was part of the exercise.

We have 16314 peptides that are present in both fractions. Analysing the difference in ratios between these two fractions are most likely to inform on whether or not they are enriched for RNA binding proteins.

# -----------------------------------------------
# Step 3 : Aggregating multiple peptides into one peptide
# heavy = non-crosslinked
# light = crosslinked
# -----------------------------------------------

# Subset the data to include columns with useful metadata and abundance ratios
# Transform the dataframe using 'melt' so the values for heavy and light are in one column. Can use this to draw plots comparing heavy to light values when necessary. 

subset.cols = norm.data[,c("master.protein.accessions","sequence","modifications","repl","uv","missing.val","light.log","heavy.log","norm.abundance.ratio")]
dim(subset.cols)
## [1] 16314     9
# Tried various methods of aggregation 
# Using sequence and repeat columns to aggregate
# Using all columns but the abundance ratio columns to aggregate
# Using mean, median or max to aggregate
# Using ddply as an alternative to aggregate
# Note : Finally, settled on taking the mean of the logged values and using 'aggregate' function

# We have 15356 unique peptide groups across all samples
agg.mean = aggregate(cbind(light.log,heavy.log,norm.abundance.ratio)~sequence+repl,data=subset.cols,FUN="mean")
agg.pep.table = table(agg.mean$sequence,agg.mean$repl)
table(agg.mean$repl) # 275mJ, replicate 3 has a lot more peptide groups than other samples
## 
## 150mJ.1 150mJ.2 150mJ.3 275mJ.1 275mJ.2 275mJ.3 400mJ.1 400mJ.2 400mJ.3 
##    1802    1366    1590    1728    1275    2882    1894    1466    1353
# Now that peptides have been aggregated into peptide groups, re-calculate the missing value table...
write.table(agg.pep.table, paste(outdir,"Aggregated-pepides-no-missing-values.txt",sep="/"),sep="\t",quote=F)

# Will go with agg.mean for further analysis
agg = agg.mean

# I want to add protein annotations back to the aggregated data. Just want to make sure that one peptide doesn't map to multiple proteins. It shouldn't as we have only selected unique ones. Checked and is true. 
for(i in 1:nrow(agg)){
  agg$num.prot[i] = length(unique(subset.cols$master.protein.accessions[which(subset.cols$sequence == agg$sequence[i] & subset.cols$repl == agg$repl[i])]))
  agg$accessions[i] = paste(unique(subset.cols$master.protein.accessions[which(subset.cols$sequence == agg$sequence[i] & subset.cols$repl == agg$repl[i])]),collapse=",")
}

head(agg)
##                            sequence    repl light.log heavy.log
## 1                       AAAAAAALQAK 150mJ.1  27.91815  24.91056
## 2                         AAAETQSLR 150mJ.1  26.22839  22.15931
## 3                        AAAMANNLQK 150mJ.1  24.43873  20.25003
## 4 AAEAAPPTQEAQGETEPTEQAPDALEQAADTSR 150mJ.1  19.35638  19.07300
## 5                          AAGPISER 150mJ.1  22.39100  22.81245
## 6                  AAGPSLSHTSGGTQSK 150mJ.1  20.91634  17.96804
##   norm.abundance.ratio num.prot accessions
## 1            3.0075883        1     P36578
## 2            4.0690834        1     Q13523
## 3            4.1886989        1     Q14498
## 4            0.2833818        1     Q8N163
## 5           -0.4214498        1     Q15427
## 6            2.9483034        1     P27694
table(agg.mean$repl)
## 
## 150mJ.1 150mJ.2 150mJ.3 275mJ.1 275mJ.2 275mJ.3 400mJ.1 400mJ.2 400mJ.3 
##    1802    1366    1590    1728    1275    2882    1894    1466    1353
# Temporarily recast data into a matrix to calculate correlations

for (t in c("light.log","heavy.log","norm.abundance.ratio")){
  m = melt(agg,id.vars = c("sequence","repl"), measure.vars = t)
  m.cast = dcast(m, sequence~repl+variable, fun.aggregate = mean)
  cor.m = cor(m.cast[,2:ncol(m.cast)],use="pairwise.complete.obs")
  colnames(cor.m) = gsub(paste("_",t,sep=""),"",colnames(cor.m))
  rownames(cor.m) = gsub(paste("_",t,sep=""),"",rownames(cor.m))
  heatmap.2(cor.m,trace = "none", dendrogram="none",col="redgreen", main=t)
}

Looking at the correlations for ‘heavy’ and ‘light’ abundance values across all replicates, it looks like the correlation is more within experimental replicates i.e high for rep1 of 150mJ, 275mJ, 400mJ than between 150mJ.rep1 and 150mJ.rep2 and so on.

# -----------------------------------------------
# Step 4 : Aggregating multiple peptides into one protein 
# heavy = non-crosslinked
# light = crosslinked
# -----------------------------------------------

# We have 1262 unique proteins across all samples
agg.prot = aggregate(cbind(light.log,heavy.log,norm.abundance.ratio)~accessions+repl,data=agg,FUN="mean")
dim(agg.prot)
## [1] 5749    5
# Table of proteins vs samples - contingency to say which protein is present in which sample. 
# Will help make overlaps
agg.prot.table = table(agg.prot$accessions,agg.prot$repl)
write.table(agg.prot.table, paste(outdir,"Aggregated-proteins-no-missing-values.txt",sep="/"),sep="\t",quote=F)
table(agg.prot$repl)
## 
## 150mJ.1 150mJ.2 150mJ.3 275mJ.1 275mJ.2 275mJ.3 400mJ.1 400mJ.2 400mJ.3 
##     676     544     600     648     511     943     696     594     537

There seem to be on average, ~640 proteins per sample in this experiment. 275mJ, rep3 has an unusually high number at 943. The samples at 150mJ of UV exposure have ~605 proteins, 275mJ have on average 716 proteins and 400mJ have on average 610 proteins.

# Now that we have the proteins, what are they doing functionally

# First let us look at the intersects within and between replicates
prot.matrix = as.data.frame.matrix(agg.prot.table)
print(dim(prot.matrix))
## [1] 1262    9
# Contains counts of overlap across 9 samples in various combinations
# Most intersections not very useful except that it tell us how many proteins overlap across all 9 samples = 211
prot.venn = venn(prot.matrix,show.plot=F)
isect = attr(prot.venn,"intersections")

# table of intersections
isect.count = t(as.data.frame(lapply(isect,length))) 
colnames(isect.count) = "Count"
write.table(isect.count, paste(outdir,"Count-of-protein-overlaps-across-various-samples.txt",sep="/"),sep="\t",quote=F)

# Looking at overlaps within each uv dose - the more useful intersection exercise
add.int = NULL 

# Looping through each uv dosage triplicate - 1:3, 4:6, 7:9
# add.int contains all intersections for each triplicate
for(k in c(1,4,7)){
  print(k)
  prot.venn.tmp = venn(prot.matrix[,k:(k+2)],show.plot=F)
  vennDiagram(prot.matrix[,k:(k+2)],circle.col=c("turquoise", "salmon","palegreen"))
  add.int = c(add.int,attr(prot.venn.tmp,"intersections"))
}
## [1] 1

## [1] 4

## [1] 7

write.table(t(as.data.frame(lapply(add.int,length))), paste(outdir,"Count-of-protein-overlaps-within-repl-for-each-dose.txt",sep="/"),sep="\t",quote=F)

The venn diagrams show the overlap of proteins within each uv dosage across replicates. There are between 350 and 370 overlapping proteins within each UV dosage. Across all 9 replicates, there are 211 proteins that we can extract as shown below. The next step is to map these proteins to some functional annotations. We will map each interaction group separately to see what it yields. Will use ‘clusterProfiler’ to do this.

isect[280] $150mJ.1:150mJ.2:150mJ.3:275mJ.1:275mJ.2:275mJ.3:400mJ.1:400mJ.2:400mJ.3 [1] “A0A024R4E5” “A0A087WUT6” “A0A087WVQ6” “A0A087X0X3” “A0A0A0MRV0” “A0A0C4DG17” “A0A0C4DG49” “A0A0D9SF53” “A0A0G2JNW7” “A0A0U1RRM4” “A8MXP9” “E7EVA0”
[13] “F5H2F4” “F8W930” “G8JLB6” “H3BLZ8” “J3KPP4” “J3KTA4” “M0QYS1” “O00203” “O00567” “O15479” “O43175” “O43390”
[25] “O43493” “O60506” “O75369” “O75400” “O75533” “O75534” “O76021” “O95218” “P02545” “P02786” “P04406” “P04792”
[37] “P05023” “P05556” “P06748” “P06756” “P07195” “P07237” “P07355” “P07737” “P07814” “P07900” “P08238” “P08621”
[49] “P08670” “P09429” “P09619” “P09651” “P0C7U0” “P10809” “P10909” “P11047” “P11142” “P11387” “P11940” “P13639”
[61] “P13667” “P14618” “P14625” “P14866” “P15880” “P16070” “P16989” “P18124” “P18583” “P18827” “P19338” “P20700”
[73] “P21399” “P22314” “P22626” “P23246” “P23396” “P23526” “P26006” “P26373” “P26641” “P30101” “P31942” “P31948”
[85] “P32004” “P35052” “P35579” “P35613” “P35659” “P36578” “P37802” “P38646” “P39023” “P42704” “P42892” “P43121”
[97] “P46777” “P46781” “P46976” “P47914” “P48634” “P49327” “P49411” “P49588” “P49756” “P50454” “P50895” “P50914”
[109] “P50990” “P51991” “P52597” “P53396” “P55011” “P55072” “P55290” “P55795” “P60709” “P61247” “P61978” “P62241”
[121] “P62249” “P62263” “P62269” “P62424” “P62753” “P62913” “P62995” “P67809” “P78527” “Q00839” “Q01105” “Q01130”
[133] “Q01844” “Q02878” “Q05519” “Q07065” “Q07666” “Q07954” “Q08170” “Q08211” “Q08945” “Q09666” “Q12849” “Q12906”
[145] “Q13148” “Q13151” “Q13243” “Q13263” “Q13283” “Q13641” “Q13740” “Q14103” “Q14108” “Q14152” “Q14315” “Q14444”
[157] “Q14498” “Q14690” “Q15061” “Q15084” “Q15149” “Q15233” “Q15287” “Q15717” “Q15758” “Q15904” “Q16629” “Q16658”
[169] “Q5BKZ1” “Q5T6F2” “Q6PD62” “Q6UVK1” “Q7KZF4” “Q7L2E3” “Q7L4I2” “Q7Z3B1” “Q86SJ2” “Q8N7H5” “Q8NC51” “Q8NE71”
[181] “Q8WVC0” “Q92541” “Q92879” “Q92945” “Q96AE4” “Q96I24” “Q96KR1” “Q96PK6” “Q96T37” “Q99700” “Q99714” “Q9BRL6”
[193] “Q9BUQ8” “Q9H307” “Q9NR30” “Q9NUM4” “Q9NWH9” “Q9NZB2” “Q9P121” “Q9UKM9” “Q9UQ35” “Q9UQ80” “Q9Y2W1” “Q9Y2X3”
[205] “Q9Y383” “Q9Y3Y2” “Q9Y490” “Q9Y4C8” “Q9Y4L1” “Q9Y520” “X5DQS5”

# KEGG enrichment for the intersections
# Let's see if we can work out whether the proteins are going up or down in light:heavy or crosslink:non-crosslink
# Main interactions of interest are (1) across all 9 samples (n = 211) (2) Overlap within each triplicate - 150mJ = 369, 275mJ = 358

prot.univ = bitr(unique(filt.1a$master.protein.accessions), fromType="UNIPROT", toType=c("ENTREZID"), OrgDb="org.Hs.eg.db") # 1157/1262 genes have Entrez Ids
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(unique(filt.1a$master.protein.accessions), fromType =
## "UNIPROT", : 10.7% of input gene IDs are fail to map...
across.9.kegg = enrichK(isect,280,agg.prot,0.05,outdir)

## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(down.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 9.09% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(up.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 5.21% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns

across.150mJ = enrichK(add.int,"150mJ.1:150mJ.2:150mJ.3",agg.prot,0.05,outdir)

## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(down.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 12.5% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(up.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 6.23% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns

across.275mJ = enrichK(add.int,"275mJ.1:275mJ.2:275mJ.3",agg.prot,0.05,outdir)

## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(down.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 13.64% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(up.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 5.59% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns

across.400mJ = enrichK(add.int,"400mJ.1:400mJ.2:400mJ.3",agg.prot,0.05,outdir)

## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(down.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 8.7% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(up.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 6.9% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns

all.kegg = rbind(across.9.kegg,across.150mJ,across.275mJ,across.400mJ)
write.table(all.kegg, paste(outdir,"KEGG-enrichment-for-enriched-proteins.txt",sep="/"),sep="\t",quote=F)

Not sure what to define the protein “universe” as. Used all of the proteins in the aggregaed list but this is not sufficient to run the KEGG analysis (throws a “not suffiecient members in group” error. Need to read a bit more about the inner workings of enrichKEGG to see if this can be changed.

Meanwhile, the overlapping proteins across all samples are enriched for the terms “Ribosome”,“Spliceosome”,“Protein processing in ER”,“Cell adhesion molecules” etc…Rayner concerned about presence of proteoglycans as these could be unwanted members entering the interface. Experiments are underway to check this.

I have also done an enrichment for proteins that were common within triplicate and each UV dosage. Get very similar terms as before (which is expected) and a few extra. The 150mJ dosage has the most number of significant KEGG mappings of the three dosages. There are a few pathways that aren’t enriched in the crosslinked sample (Down) but majority are. There are instances where the term “splisosome” appears in both enriched and un-enriched categories but the genes that contribute to this KEGG term are different in the enriched and unenriched cases. Might be worth pursuing these genes that in the unenriched category - they are heterogeneous nuclear riboneucleo protein and small nuclear ribonucleoprotein, RNA helicase and splicing factor subunit.

The 275mJ dosage has a high number of histones which map to pathways such as Systemic lupus erythematosus, Viral carcinogenesis, ECM-receptor interaction and Alcoholism which are a bit odd. If you remember, 275mJ has on average more proteins per sample than the other two time points. Perhaps this isn’t the ideal UV dosage for the study.